* Session setup: pin the interpreter version, reset the session, then load the raw grades file.
version 8.2
capture clear
capture log close
set more off
set mem 510m
set mat 800

* Load the raw data. The documented option of use for discarding the data in memory is clear.
use "grades.dta", clear


***leave-one-out group mean of quiz1, by group_T within term:
* For each student: mean quiz score of the other members of the same group_T/term cell.

forvalues i = 1/1 {
	* one flags the students who have a non-missing quiz score
	gen one = 1 if quiz`i' != .

	* group total of the quiz, minus the own score when the student has one
	* (students without a score keep the full group total)
	egen group_quiz`i' = sum(quiz`i'), by(group_T term)
	replace group_quiz`i' = group_quiz`i' - quiz`i' if one == 1

	* number of quiz takers in the group, minus oneself when applicable
	egen number_group_quiz`i' = sum(one), by(group_T term)
	replace number_group_quiz`i' = number_group_quiz`i' - 1 if one == 1

	* a zero denominator (no other quiz taker in the group) gives a missing mean
	gen mean_group_quiz`i' = group_quiz`i' / number_group_quiz`i'
	sum mean_group_quiz`i'
	drop one
}




*sort term group_T
*browse studentname group_T term quiz1 one group_quiz1 number_group_quiz1 mean_group_quiz1


*browse id term mean_group_quiz1 sd_mean_group_quiz1 mean_group_quiz1_st	
	

* Leave-one-out group mean of quiz3, by group_P within term:
* mean quiz score of the other members of the same group_P/term cell.
forvalues i = 3/3 {
	* one flags the students who have a non-missing quiz score
	gen one = 1 if quiz`i' != .

	* group total of the quiz, minus the own score when the student has one
	* (students without a score keep the full group total)
	egen group_quiz`i' = sum(quiz`i'), by(group_P term)
	replace group_quiz`i' = group_quiz`i' - quiz`i' if one == 1

	* number of quiz takers in the group, minus oneself when applicable
	egen number_group_quiz`i' = sum(one), by(group_P term)
	replace number_group_quiz`i' = number_group_quiz`i' - 1 if one == 1

	* a zero denominator (no other quiz taker in the group) gives a missing mean
	gen mean_group_quiz`i' = group_quiz`i' / number_group_quiz`i'
	sum mean_group_quiz`i'
	drop one
}



****standard deviation of quiz1 within group_T/term, excluding the own score:
* Brute force: blank out one row at a time, recompute the group SD on the
* working copy, and keep the result for that row only.
gen sd_group_quiz1 = .
gen temp_quiz1 = quiz1
local nobs = _N
forvalues row = 1/`nobs' {
	replace temp_quiz1 = . in `row'
	egen temp_sd_group_quiz1 = sd(temp_quiz1), by(group_T term)
	replace sd_group_quiz1 = temp_sd_group_quiz1 in `row'
	* only this row was blanked, so restoring it alone resets the working copy
	replace temp_quiz1 = quiz1 in `row'
	drop temp_sd_group_quiz1
}
drop temp_quiz1


****standard deviation of quiz3 within group_P/term, excluding the own score:
* Brute force: blank out one row at a time, recompute the group SD on the
* working copy, and keep the result for that row only.
gen sd_group_quiz3 = .
gen temp_quiz3 = quiz3
local nobs = _N
forvalues row = 1/`nobs' {
	replace temp_quiz3 = . in `row'
	egen temp_sd_group_quiz3 = sd(temp_quiz3), by(group_P term)
	replace sd_group_quiz3 = temp_sd_group_quiz3 in `row'
	* only this row was blanked, so restoring it alone resets the working copy
	replace temp_quiz3 = quiz3 in `row'
	drop temp_sd_group_quiz3
}
drop temp_quiz3

* Term indicators, built from the raw term labels (which still contain blanks at this point).
gen Fall2011 = term == "Fall 2011"
gen Fall2012_001 = term == "Fall 2012_001"
gen Fall2012_002 = term == "Fall 2012_002"
*browse id term Fall201*

* Drop rows without a student id, showing the row count before and after.
* NOTE(review): the group means and SDs above were computed before this drop,
* so rows with a missing id may have contributed to them -- confirm this is intended.
count
drop if id == .
count

***Another intervention (to be studied in another paper):
* Dummies equal to 1 when email_T / email_P is positive and 0 when it is zero.
* Stata orders missing values above every number, so a bare "> 0" test is also
* true for missing values; the extra "< ." keeps rows with a missing email
* variable missing instead of coding them as 1.
gen email_T_dummy=1 if email_T>0 & email_T<.
replace email_T_dummy=0 if email_T==0
gen email_P_dummy=1 if email_P>0 & email_P<.
replace email_P_dummy=0 if email_P==0

****interventions at the group level:
* A group is flagged as soon as one of its members is flagged.
* Groups where every member has a missing dummy stay missing.
egen email_T_dummy_group = max(email_T_dummy), by(group_T term)
replace email_T_dummy_group = 1 if email_T_dummy_group != . & email_T_dummy_group > 0

egen email_P_dummy_group = max(email_P_dummy), by(group_P term)
replace email_P_dummy_group = 1 if email_P_dummy_group != . & email_P_dummy_group > 0


*browse term id email_T* email_P*

*****dummies for students at or below / above the within-term median of quiz1
* Students with a missing quiz1 keep 0 on both dummies.
gen bad_50_T = 0
gen good_50_T = 0

* remove the blanks from the term labels
replace term = "Fall2010" if term == "Fall 2010"
replace term = "Fall2011" if term == "Fall 2011"
replace term = "Fall2012_001" if term == "Fall 2012_001"
replace term = "Fall2012_002" if term == "Fall 2012_002"

foreach t in Fall2010 Fall2011 Fall2012_001 Fall2012_002 {
	* quiz1 restricted to the current term
	gen temp = quiz1 if term == "`t'"
	centile temp, centile(25 50 75)
	* r(c_2) is the second requested centile, i.e. the median
	replace bad_50_T = 1 if term == "`t'" & quiz1 <= `r(c_2)'
	replace good_50_T = 1 if term == "`t'" & quiz1 > `r(c_2)' & quiz1 != .
	drop temp
}

*browse quiz1 bad_50 good_50




*****same for presentation: dummies for students at or below / above the within-term median of quiz3
* Students with a missing quiz3 keep 0 on both dummies.
gen bad_50_P = 0
gen good_50_P = 0

foreach t in Fall2010 Fall2011 Fall2012_001 Fall2012_002 {
	* quiz3 restricted to the current term
	gen temp = quiz3 if term == "`t'"
	centile temp, centile(25 50 75)
	* r(c_2) is the second requested centile, i.e. the median
	replace bad_50_P = 1 if term == "`t'" & quiz3 <= `r(c_2)'
	replace good_50_P = 1 if term == "`t'" & quiz3 > `r(c_2)' & quiz3 != .
	drop temp
}



***string identifiers for the groups, combining term and group number:
* the group numbers are converted to strings so they can be concatenated with term
tostring group_T group_P, replace
gen term_group_T = term + "_T_" + group_T
gen term_group_P = term + "_P_" + group_P
*browse term group_T term_group_T



****now classify the major of people:
* Tabulate the raw major strings, then map each listed string to a numeric field code (1 to 11).
* Matching is exact, including the trailing blanks inside some of the literals.
* Majors that are not listed below are left with a missing field.
tab major


***could do a classification by field
***could do a classification by level (honours better than major)
***could keep the complete classification, and calculate a diversity index

* Field codes, per the commented-out field_name label definition further down:
* 1 IDS, 2 Humanities, 3 Anthropology, 4 Business, 5 Political_Science, 6 Economics,
* 7 Environment, 8 Medicine_Biology, 9 Science_Engineering, 10 Math, 11 Other
gen field=1 if major=="Intnl Development Studies -Con"|major=="Intnl Development Studies -Con * "|major=="Intnl Development Studies -HC * "|major=="Intnl Development Studies -Hon"|major=="Intnl Development Studies -Hon * "|major=="African Studies -Con * "
* NOTE(review): the next line lists the same Geography (AR)-Con literal twice; harmless, but check whether another variant was meant.
replace field=2 if major=="Canadian Studies -Con"|major=="East Asian Studies -Con"|major=="East Asian Studies -Con * "|major=="English -Con"|major=="English -Con * "|major=="Middle East Studies -Con"|major=="Middle East Studies -HC * "|major=="Philosophy -HC * "|major=="Religious Studies -Hon"|major=="Sociology -Con"|major=="TESL Elementary and Secondary"|major=="Sustainability, Sci & Soc -IFP"|major=="Geography (AR)-Con"|major=="Geography (AR)-Con * "|major=="Geography (AR)-Con * "|major=="Geography -Hon"|major=="History -Con"|major=="Humanistic Studies -Con * "|major=="Latin American Studies -Con"|major=="Linguistics -Con * "
replace field=3 if major=="Anthropology -Con"|major=="Anthropology -Con * "|major=="Anthropology -HC * "
replace field=4 if major=="Industrial Relations -FP"|major=="General Management"|major=="International Management"|major=="Marketing"|major=="International Management * "|major=="Finance"
replace field=5 if major=="Political Science -Con"|major=="Political Science -Con * "|major=="Political Science -HC * "|major=="Political Science -Hon"|major=="Political Science -Hon * "
replace field=6 if major=="Agricultural Economics"|major=="Economics * "|major=="Economics -Con"|major=="Economics -Con * "|major=="Economics -HC * "|major=="Economics -Hon"|major=="Economics -Hon * "
replace field=7 if major=="Environment"|major=="Environment -FP"|major=="Environment -FP * "|major=="Environment -Hon"|major=="Environment -IFP"|major=="Environmental Biology"
replace field=8 if major=="General Kinesiology"|major=="Nursing"|major=="Physiology"|major=="Psychology -Hon"|major=="Psychology -Con"|major=="Psychology -Con * "|major=="Anatomy and Cell Biology"|major=="Biology -CSC"|major=="Biology -Con"|major=="Biology -Con * "
replace field=9 if major=="Biomedical Sciences -Con"|major=="Biomedical Sciences -Con * "|major=="Cognitive Science -IFP"|major=="Computer Science -Con * "|major=="Neuroscience"|major=="Chemical Engineering"|major=="Civil Engineering"|major=="Computer Engineering"|major=="Electrical Engineering"|major=="Chemistry"|major=="Cognitive Science -Hon"|major=="Materials Engineering CO-OP"|major=="Mining Engrg: Co-op Pgm"
replace field=10 if major=="Math for Management -Con * "|major=="Mathematics"|major=="Mathematics -Con * "
replace field=11 if major=="Exchange"|major=="Undeclared"|major=="Freshman Program"

*label define field_name 1 IDS 2 Humanities 3 Anthropology 4 Business 5 Political_Science 6 Economics 7 Environment 8 Medicine_Biology 9 Science_Engineering 10 Math 11 Other
*label values field field_name
sort major
*browse major field if field==.

* Collapse the 11 field codes into 6 broader categories; a missing field stays missing.
gen fshort = .
replace fshort = 1 if field == 1 | field == 2 | field == 3
* fields 4, 5 and 6 map one-to-one onto categories 2, 3 and 4
forvalues f = 4/6 {
	replace fshort = `f' - 2 if field == `f'
}
replace fshort = 5 if field == 7 | field == 8 | field == 9 | field == 10
replace fshort = 6 if field == 11

*label define fieldshort_name 1 Arts 2 Business 3 Political_Science 4 Economics 5 Science 6 Other
*label values fshort fieldshort_name
sort major
*browse major fshort

****number of distinct fields represented in each group (group_T, then group_P):
sort term group_T position_number_T
*browse term group_T position_number_T quiz1 field

foreach side in T P {
	* order the members by field within each group so that equal fields are adjacent
	sort term group_`side' field
	*browse term group_T field
	* flag each row whose field differs from the previous row of the same group
	gen div_temp = (field != field[_n-1]) & (term == term[_n-1]) & (group_`side' == group_`side'[_n-1])
	* number of changes + 1 = number of distinct values (a missing field counts as a value of its own)
	egen div_group_field_`side' = sum(div_temp), by(term group_`side')
	replace div_group_field_`side' = div_group_field_`side' + 1
	drop div_temp
	*browse term group_T field div_group_field_T
}
	
	
****Test of randomization from Golf paper: mean quiz of the rest of the class (same term), excluding oneself
* The variable one is no longer used in this section, but it is still created
* here because a later section of this file drops it.
gen one=1
forvalues i=1(1) 4{
	* term total of the quiz minus the own score; missing when the own score is missing
	egen class_quiz`i'=sum(quiz`i'), by(term)
	replace class_quiz`i'=class_quiz`i'-quiz`i'

	* Denominator: number of OTHER students in the term with a non-missing score.
	* The total above skips missing scores, so the count has to skip them too;
	* counting every enrolled student would bias the mean downward whenever
	* somebody did not take the quiz.
	egen number_class_quiz`i'=count(quiz`i'), by(term)
	replace number_class_quiz`i'=number_class_quiz`i'-1 if quiz`i'<.
	gen mean_class_quiz`i'=class_quiz`i'/number_class_quiz`i'
	sum mean_class_quiz`i'
	}


*****student characteristics:

* both origin codings share the same label text
foreach v in origine1 origine2 {
	label variable `v' "Origin (1=East Asian, 2=Black, 3=South Asian, 4=Caucasian, 5=Arab)"
}

* check how strongly the two origin variables correlate
corr origine1 origine2
*good correlation

label variable sexe3 "Gender (0=Female, 1=Male)"
label variable nom3 "Name (1=French, 2=English, 3=Other)"
label variable sourire3 "Smile (0=No smile, 1=Normal Smile, 2=Big smile (ultra-bright)"
label variable origin3 "Origin (1=East Asian, 2=Black, 3=South Asian, 4=Caucasian, 5=Middle Eastern)"
	
***create diversity indices with nom3 and origin3

****reordered origin code, used for the distance of a student to the others in the group:
* origin3 codes 1 2 3 4 5 become origin4 codes 1 4 2 5 3;
* any other origin3 value, including missing, is left at 0.
gen origin4 = 0
local newcodes 1 4 2 5 3
forvalues old = 1/5 {
	local new : word `old' of `newcodes'
	replace origin4 = `new' if origin3 == `old'
}

label define origin_name 1 East_Asian 2 South_Asian 3 Middle_Eastern 4 Black 5 Caucasian
label values origin4 origin_name


***number of distinct origin4 values in each group (group_T, then group_P):

local list_variables origin4
foreach var of local list_variables {
	foreach side in T P {
		* order the members by the variable within each group so that equal values are adjacent
		sort term group_`side' `var'
		* flag each row whose value differs from the previous row of the same group
		gen div_temp = (`var' != `var'[_n-1]) & (term == term[_n-1]) & (group_`side' == group_`side'[_n-1])
		* number of changes + 1 = number of distinct values in the group
		egen div_group_`var'_`side' = sum(div_temp), by(term group_`side')
		replace div_group_`var'_`side' = div_group_`var'_`side' + 1
		drop div_temp
	}
}


***controls:
* Coding of the source variables, as given by their variable labels:
*   sexe3    0=Female, 1=Male
*   nom3     1=French, 2=English, 3=Other
*   sourire3 0=No smile, 1=Normal Smile, 2=Big smile

* name dummies; a missing nom3 gives 0 on both
gen french_name = (nom3 == 1)
label variable french_name "Francophone name"
gen english_name = (nom3 == 2)
label variable english_name "Anglophone name"

* smile is 1 only for code 2 of sourire3; every other value, including missing, gives 0
gen smile = (sourire3 == 2)
label variable smile "Smile"

***number female in groups:
* female is the complement of sexe3 and is missing when sexe3 is missing
gen female = 1 - sexe3
label variable female "Female (0,1)"

*number_group: group size; number_female: number of female members in the group

* rebuild the helper constant, then total it (and female) within each group/term cell
drop one
gen one = 1
foreach side in T P {
	egen number_group_`side' = sum(one), by(group_`side' term)
	egen number_female_`side' = sum(female), by(group_`side' term)
}
drop one

*sort group_T term
*browse group_T term female number_group_T number_female_T

*browse number_group_P number_female_P
	
****class:
* Extract a digit from 0 to 4 out of the class string.
* When several of these digits occur, the largest one wins because it is assigned last.
gen class_clean = .

forvalues yr = 0/4 {
	replace class_clean = `yr' if strpos(class, "`yr'")
}

*browse class class_clean
tab class_clean
*browse class class_clean if class_clean==.
* Undergraduate Non-Degree students are treated as a 3, because these students are mature
replace class_clean = 3 if class == "Undergraduate Non-Degree"
*browse class class_clean if class_clean==.
*browse class class_clean if class_clean==.
	
* third_fourth is 1 when class_clean is 3 or 4, and 0 otherwise (including a missing class_clean)
gen third_fourth = (class_clean == 3 | class_clean == 4)

* number of such students in each group/term cell
egen number_third_fourth_T = sum(third_fourth), by(group_T term)
egen number_third_fourth_P = sum(third_fourth), by(group_P term)

*sort group_T term
*browse group_T term female number_group_T number_female_T

	
*****controls: one dummy per category, labelled in ascending order of the category code
* NOTE(review): tab with gen() creates one dummy per value that actually occurs, so the
* labels below line up with the codes only if every expected category is present -- confirm.

tab field
tab field, gen(fielddum)
local k = 0
foreach lbl in IDS Humanities Anthropology Business Political_Science Economics Environment Medicine_Biology Science_Engineering Math Other {
	local k = `k' + 1
	label variable fielddum`k' "`lbl'"
}

tab fshort
tab fshort, gen(fshortdum)
local k = 0
foreach lbl in Humanities Business Political_Science Economics Science Others {
	local k = `k' + 1
	label variable fshortdum`k' "`lbl'"
}

* origin3 coding: 1=East Asian, 2=Black, 3=South Asian, 4=Caucasian, 5=Middle Eastern
tab origin3, gen(origin3_dum)
local k = 0
foreach lbl in "East Asian" "Black" "South Asian" "Caucasian" "Middle Eastern" {
	local k = `k' + 1
	label variable origin3_dum`k' "`lbl'"
}

* NOTE(review): class_clean can take the value 0 in principle; the four labels assume it does not occur.
tab class_clean, gen(class_clean_dum)
local k = 0
foreach lbl in "First year" "Second year" "Third year" "Fourth year" {
	local k = `k' + 1
	label variable class_clean_dum`k' "`lbl'"
}


***these students dropped the course:
* one drop per id; the same set of rows is removed as with a single or-condition
foreach sid in 739551609 999999998 740000000 739583739 739539009 {
	drop if id == `sid'
}

* remove any remaining row without an id, then write the regression file
drop if id == .

save "data_for_regression.dta", replace

exit
